import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Root directory of the KDD Cup 2018 air-quality dataset.
path='../ml_dataset/2018_kdd_cup_dataset/'
# Quick-look reads (notebook display cells; return values intentionally discarded).
pd.read_csv(path+"beijing_17_18_aq.csv")
pd.read_csv(path+"beijing_201802_201803_aq.csv")
# BUG FIX: this file is an Excel workbook, so it must be read with
# pd.read_excel — pd.read_csv on an .xlsx either fails or yields garbage.
beji_aqi_sta=pd.read_excel(path+"Beijing_AirQuality_Stations_en.xlsx")
Lodon_aqi=pd.read_csv(path+"London_historical_aqi_forecast_stations_20180331.csv")
Lodon_aqi
# Per-column missing-value counts for the London AQI forecast table.
Lodon_aqi.isnull().sum()
#import xlrd
import csv
import pandas as pd
def xlsx_to_csv_pd(src=None, dst='Beijing_AirQuality_Stations.csv'):
    """Convert the Beijing stations Excel sheet to a UTF-8 CSV file.

    Parameters default to the original hard-coded paths so existing
    calls keep working; they are exposed so other sheets can be
    converted with the same helper.

    src: path to the input .xlsx (defaults to the dataset's station sheet).
    dst: path of the CSV file to write.
    """
    if src is None:
        src = path + 'Beijing_AirQuality_Stations_en.xlsx'
    data_xls = pd.read_excel(src, index_col=0)
    data_xls.to_csv(dst, encoding='utf-8')
# Entry point: convert the Beijing stations workbook to CSV once.
if __name__ == '__main__':
    xlsx_to_csv_pd()
# Inspect the London "other stations" AQI file (display cell; result discarded).
pd.read_csv(path+"London_historical_aqi_other_stations_20180331.csv")
Geography
# Load the grid weather-station coordinate tables for both cities
# (display cells; the return values are not kept).
pd.read_csv(path+"London_grid_weather_station.csv")
pd.read_csv(path+"Beijing_grid_weather_station.csv")
pd.read_csv(path+"London_grid_weather_station.csv")
pd.read_csv(path+"Beijing_historical_meo_grid.csv")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Beijing 2017-2018 air-quality readings, grouped per monitoring station.
beijingAqCsv = pd.read_csv('../ml_dataset/2018_kdd_cup_dataset/beijing_17_18_aq.csv')
air_station_beij = beijingAqCsv.groupby("stationId", sort=False)
# Station metadata CSV produced earlier by xlsx_to_csv_pd().
beijingAqCsv_sta = pd.read_csv('Beijing_AirQuality_Stations.csv')
beijingAqCsv_sta
import folium
list(air_station_beij.groups.keys())
# Build station name -> [latitude, longitude] lookup.
# NOTE(review): the station name appears to live in the 'Pollutant Species'
# column of the converted sheet — presumably a header artifact of the Excel
# conversion; verify against the source workbook.
location_beijin={}
for i in range(0,len(beijingAqCsv_sta)):
    temp=beijingAqCsv_sta.iloc[i,:]
    if temp['Pollutant Species'] in list(air_station_beij.groups.keys()):
        # Remaining columns hold coordinates; reversed to (lat, lon) for folium.
        location_beijin[temp['Pollutant Species']]=temp.values[1:].astype(float).tolist()[::-1]
location_beijin
#del temp
# Coordinates of the first AQ station, used as the map centre.
temp=beijingAqCsv_sta.loc[beijingAqCsv_sta['Pollutant Species']==list(air_station_beij.groups.keys())[0]].iloc[:,1:].values[0]
temp
temp=temp.astype(float).tolist()
temp=temp[::-1]
map_beijin_1 = folium.Map(location=temp, zoom_start=9)
for key in location_beijin:
    folium.Marker(location=location_beijin[key]).add_to(map_beijin_1)
map_beijin_1
# Same marker map again, this time on a terrain tile layer.
temp=beijingAqCsv_sta.loc[beijingAqCsv_sta['Pollutant Species']==list(air_station_beij.groups.keys())[0]].iloc[:,1:].values[0]
temp=temp.astype(float).tolist()
temp=temp[::-1]
map_beijin_1 = folium.Map(location=temp, zoom_start=9,tiles='Stamen Terrain')
for key in location_beijin:
    folium.Marker(location=location_beijin[key]).add_to(map_beijin_1)
map_beijin_1
# Historical meteorology on the Beijing grid (one row per grid point per hour).
beijing_grid_sta=pd.read_csv(path+"Beijing_historical_meo_grid.csv")
beijing_grid_sta
beiji_grid={}
#faster in kmeans
from sklearn.cluster import MiniBatchKMeans
# Cluster grid points into as many clusters as there are AQ stations.
kmeans = MiniBatchKMeans(n_clusters=len(location_beijin), batch_size=1000).fit(beijing_grid_sta[['latitude','longitude']])
beijing_grid_sta.loc[:, 'label'] = kmeans.labels_
map_beijing_2 = folium.Map(location=beijing_grid_sta[['latitude','longitude']].iloc[0].values.tolist(),
                           zoom_start=9)
for label in kmeans.cluster_centers_:
    # [label][0] is just `label` itself — a cluster-centre coordinate pair.
    folium.Marker(location=[label][0]).add_to(map_beijing_2)
map_beijing_2
# Same cluster-centre map on a terrain tile layer.
map_beijing_2 = folium.Map(location=beijing_grid_sta[['latitude','longitude']].iloc[0].values.tolist(),
                           zoom_start=9,tiles='Stamen Terrain')
for label in kmeans.cluster_centers_:
    folium.Marker(location=[label][0]).add_to(map_beijing_2)
map_beijing_2
# Base map for the animated heatmap below.
map_heat_ground_beijing_1 = folium.Map(location=beijing_grid_sta[['latitude','longitude']].iloc[0].values.tolist(),
                                       zoom_start =6, attr='USGS style')
import folium
from folium import plugins
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
#m = folium.Map([,], control_scale = True, zoom_start=11)
#plugins.HeatMap(data, radius = 20, min_opacity = 0.1, max_val = 50,gradient={.2: 'blue', .5: 'lime', 1: 'red'}).add_to(m)
lats = np.array([float(row['latitude'])for index, row in beijing_grid_sta.iloc[:1000].iterrows() ])
lats
#prevent too large
# Take only the first 10000 rows to keep the heatmap data manageable.
lats = np.array([float(row['latitude'])for index, row in beijing_grid_sta.iloc[:10000].iterrows() ])
lons = np.array([float(row['longitude']) for index, row in beijing_grid_sta.iloc[:10000].iterrows() ])
mag = np.array([float(row['temperature']) for index, row in beijing_grid_sta.iloc[:10000].iterrows() ])
time=np.array([row['utc_time'] for index, row in beijing_grid_sta.iloc[:10000].iterrows() ])
# Stack into columns (lat, lon, temperature, utc_time), one sample per row after .T.
data=np.vstack((lats,lons,mag,time))
data=data.T
data[:,:3]
set(time)
list(set(time))
# Bucket the (lat, lon, temperature) samples by time slice for HeatMapWithTime.
# PERF FIX: the original rebuilt list(set(time)) and ran a linear .index()
# inside the loop — O(rows * slices). Build the slice list and an index map
# once instead; the resulting bucket contents are identical.
time_slices = list(set(time))
slice_index = {t: i for i, t in enumerate(time_slices)}
init_ = [[] for _ in range(len(time_slices))]
init_
for i in range(0, len(data)):
    init_[slice_index[data[i, 3]]].append(data[i, :3].astype(float).tolist())
len(init_)
# List comprehension to make out list of lists
colormap ={.4: 'blue', .65: 'lime', 1: 'red'}
# Plot it on the map
hm = plugins.HeatMapWithTime(init_,radius = 15)
hm.add_to(map_heat_ground_beijing_1)
#plugins.HeatMap(init_, radius = 20,gradient=colormap).add_to(map_heat_ground_1_2)
# Display the map
map_heat_ground_beijing_1
# London station metadata and station name -> [lat, lon] lookup.
LondonAqCsv_sta = pd.read_csv(path+'London_AirQuality_Stations.csv')
LondonAqCsv_sta
location_Lodon={}
for i in range(0,len(LondonAqCsv_sta)):
    # Station code lives in the unnamed first column of the CSV.
    temp_name=LondonAqCsv_sta['Unnamed: 0'].iloc[i]
    temp_Latitude=LondonAqCsv_sta['Latitude'].iloc[i]
    temp_Longitude=LondonAqCsv_sta['Longitude'].iloc[i]
    location_Lodon[temp_name]=[temp_Latitude,temp_Longitude]
location_Lodon
#del temp
col_list=['Latitude','Longitude']
temp_pd=LondonAqCsv_sta[col_list]
temp_pd.iloc[0].values.tolist()
# Plain map of all London AQ stations, centred on the first station.
map_london_1 = folium.Map(location=temp_pd.iloc[0].values.tolist(), zoom_start=9)
for key in location_Lodon:
    folium.Marker(location=location_Lodon[key]).add_to(map_london_1)
map_london_1
# Same markers on a terrain tile layer.
map_london_1= folium.Map(location=temp_pd.iloc[0].values.tolist(), zoom_start=9,tiles='Stamen Terrain')
for key in location_Lodon:
    folium.Marker(location=location_Lodon[key]).add_to(map_london_1)
map_london_1
# Combine the two Beijing AQI dumps (Feb-Mar 2018 and 2017-2018) into one
# frame sorted by timestamp, then re-index 0..n-1.
beji_aqi_2018=pd.read_csv(path+"beijing_201802_201803_aq.csv")
beji_aqi_2017_2018=pd.read_csv(path+"beijing_17_18_aq.csv")
beji_aqi_summary=pd.concat([beji_aqi_2018,beji_aqi_2017_2018],axis=0).sort_values(by=['utc_time'])
beji_aqi_summary=beji_aqi_summary.set_index([list(range(0,len(beji_aqi_summary)))])
beji_aqi_summary
len(beji_aqi_summary.groupby(['utc_time']))
# Missing-value ratio per column.
beji_aqi_summary.isnull().sum()/len(beji_aqi_summary)
import gc
gc.collect()
# One PM2.5 time-series plot per station.
for key in beji_aqi_summary.groupby(['stationId']).groups:
    plt.clf()
    plt.figure(figsize=(20,10))
    plt.title(key,fontsize=18)
    temp_data=beji_aqi_summary.iloc[beji_aqi_summary.groupby(['stationId']).groups[key]]
    plt.plot(temp_data['PM2.5'])
    #plt.xtickets(temp_data['utc_time'], fontsize=15)
    plt.show()
#missing values ratio
# Flag Beijing stations whose PM2.5 series is more than 15% missing.
# PERF FIX: hoist the groupby — the original recomputed
# beji_aqi_summary.groupby(['stationId']) twice per loop iteration.
beijin_missing_pm25=[]
station_groups = beji_aqi_summary.groupby(['stationId']).groups
for key in station_groups:
    temp_data = beji_aqi_summary.iloc[station_groups[key]]
    ratio = float(temp_data['PM2.5'].isnull().sum()/len(temp_data))
    if ratio > 0.15:
        print(key)
        beijin_missing_pm25.append(key)
# Re-draw the Beijing station map, highlighting in red the stations with
# more than 15% missing PM2.5 values.
temp=beijingAqCsv_sta.loc[beijingAqCsv_sta['Pollutant Species']==list(air_station_beij.groups.keys())[0]].iloc[:,1:].values[0]
temp=temp.astype(float).tolist()
temp=temp[::-1]
map_beijin_1 = folium.Map(location=temp, zoom_start=9,tiles='Stamen Terrain')
for key in location_beijin:
    if key in beijin_missing_pm25:
        folium.Marker(location=location_beijin[key],
                      popup='Missing value ratio > 0.15 Location',
                      icon=folium.Icon(color='red',icon='info-sign')).add_to(map_beijin_1)
    else:
        folium.Marker(location=location_beijin[key]).add_to(map_beijin_1)
map_beijin_1
# Reload the grid weather data and overlay the k-means cluster centres in green.
beijing_grid_sta=pd.read_csv(path+"Beijing_historical_meo_grid.csv")
beijing_grid_sta.columns
beijing_grid_sta.iloc[beijing_grid_sta.groupby(['stationName']).groups['beijing_grid_648'][0],1:3].values
beijing_grid_sta
len(beijing_grid_sta.groupby(['utc_time']))
beiji_grid
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=len(location_beijin), batch_size=1000).fit(beijing_grid_sta[['latitude','longitude']])
beijing_grid_sta.loc[:, 'label'] = kmeans.labels_
for label in kmeans.cluster_centers_:
    folium.Marker(location=[label][0],popup='grid point',
                  icon=folium.Icon(color='green',icon='info-sign')).add_to(map_beijin_1)
map_beijin_1
#weather
len(beijing_grid_sta.groupby(['stationName']).groups)
len(beijing_grid_sta)
#feature testing
#from sklearn.linear_model import LinearRegression
#for keys in
#LinearRegression()
# Compare the sets of time slices covered by the AQI and weather tables.
a = set(beji_aqi_summary.groupby(["utc_time"]).groups.keys())
b=set(beijing_grid_sta.groupby(["utc_time"]).groups.keys())
import matplotlib_venn as venn
common_time_list=list(set(a).intersection(b))
plt.title("Common time slice")
venn.venn2([a,b],
           set_labels=("time slice in AQI station"," time slice in weather grid station"))
plt.show()
We need to check whether the time series is continuous.
# Collect row indices belonging to time slices present in only one of the two
# tables, so those rows can be dropped and the tables aligned.
exclusive=(a ^ b)
beji_aqi_summary_copy=beji_aqi_summary.copy() #a
beijing_grid_sta_copy=beijing_grid_sta.copy() #b
# BUG FIX: the original tested `first_time_1 is 1` — identity comparison with
# an int literal (a SyntaxWarning and implementation-dependent). The flags were
# also redundant: the containers start empty, so extending unconditionally is
# equivalent. The groupby index maps are hoisted out of the loop as well.
aqi_time_groups = beji_aqi_summary.groupby(["utc_time"]).groups
grid_time_groups = beijing_grid_sta.groupby(["utc_time"]).groups
contaniner_1=[]
contaniner_2=[]
for timeslice in exclusive:
    if timeslice in a:
        contaniner_1 += list(aqi_time_groups[timeslice])
    if timeslice in b:
        contaniner_2 += list(grid_time_groups[timeslice])
contaniner_1.sort()
contaniner_2.sort()
# Sanity checks on the aligned tables (display cells).
beji_aqi_summary_copy.drop(beji_aqi_summary_copy.index[contaniner_1]).sort_values(by=['utc_time'])
len(beji_aqi_summary_copy.drop(beji_aqi_summary_copy.index[contaniner_1]).groupby(['utc_time']).groups)
len(beijing_grid_sta_copy.drop(beijing_grid_sta_copy.index[contaniner_2]).groupby(['utc_time']).groups)
len(beji_aqi_summary_copy)
len(contaniner_1)/35
len(beji_aqi_summary_copy.groupby(['stationId']).groups)
len(beji_aqi_summary_copy.drop(beji_aqi_summary_copy.index[contaniner_1]).groupby(['stationId']).groups)
len(beji_aqi_summary_copy.drop(beji_aqi_summary_copy.index[contaniner_1]))/35
len(beji_aqi_summary_copy.drop(beji_aqi_summary_copy.index[contaniner_1]))/len(common_time_list)
There might be some inconsistency! The data seems to run from 2017-01-01 14:00:00 to 2018-03-27 05:00:00.
# Residual hours modulo one day; would be 0 if the hourly series were complete.
(len(common_time_list)-10-6)%24
Oops! The time slices are not continuous — some time slices are missing from the data, because the modulus should be 0.
from pandas.plotting import autocorrelation_plot
beji_aqi_summary_copy=beji_aqi_summary_copy.sort_values(by=["utc_time"])
from matplotlib import pyplot,rcParams
# Beijing pollutant columns to analyse.
col=['PM2.5','PM10','NO2','CO','O3','SO2']
def autocorr(x, t=1):
    """Return the 2x2 correlation matrix between x and x shifted by lag t.

    BUG FIX: the original computed np.corrcoef but never returned it,
    so every call evaluated to None.
    """
    return np.corrcoef(np.array([x[0:len(x)-t], x[t:len(x)]]))
# Per-station autocorrelation plots for each pollutant series.
index_set=beji_aqi_summary_copy.groupby(['stationId']).groups
rcParams.update({'font.size': 24})
for keys in index_set.keys():
    #plt.plot(autocorr(beji_aqi_summary_copy[item].fillna(0).values, t=1))
    print(keys)
    plt.clf()
    plt.figure(figsize=(40,25))
    for item in col:
        autocorrelation_plot(beji_aqi_summary_copy[item].fillna(0).iloc[index_set[keys]],label=item)
    plt.legend(loc='upper right')
    pyplot.show()
    #plt.show()
import seaborn as sns
# Per-station correlation heatmap across the pollutant columns.
for keys in index_set.keys():
    #plt.plot(autocorr(beji_aqi_summary_copy[item].fillna(0).values, t=1))
    print(keys)
    plt.clf()
    plt.figure(figsize=(10,10))
    corr = beji_aqi_summary_copy[col].fillna(0).iloc[index_set[keys]].corr()
    sns.heatmap(corr)
    pyplot.show()
# London AQI: forecast-station file (drop its leading index column) and the
# "other stations" file (keep only the first five columns).
pd.read_csv(path+"London_historical_aqi_forecast_stations_20180331.csv").iloc[:,1:]
pd.read_csv(path+"London_historical_aqi_other_stations_20180331.csv").iloc[:,:5]
London_sta=pd.read_csv(path+"London_historical_aqi_forecast_stations_20180331.csv").iloc[:,1:]
London_sta_other=pd.read_csv(path+"London_historical_aqi_other_stations_20180331.csv").iloc[:,:5]
# Drop rows with no station id.
London_sta_other.loc[~London_sta_other["Station_ID"].isnull(),:]
London_sta_other=London_sta_other.loc[~London_sta_other["Station_ID"].isnull(),:]
London_sta_other.columns
#change to lower char
London_sta_other.columns = London_sta_other.columns.str.lower()
London_sta.columns = London_sta.columns.str.lower()
London_sta_other.columns==London_sta.columns
London_sta
London_sta_other
#columnsTitles=["measurementdategmt","station_id"]
# Align column order, then concatenate, sort by timestamp and re-index 0..n-1.
London_sta_other.reindex(columns=list(London_sta.columns))
London_sta_other=London_sta_other.reindex(columns=list(London_sta.columns))
pd.concat([London_sta_other,London_sta],axis=0)
London_comb=pd.concat([London_sta_other,London_sta],axis=0).sort_values(by=['measurementdategmt'])
London_comb=London_comb.set_index([list(range(0,len(London_comb)))])
London_comb.isnull().sum()/len(London_comb)
import gc
gc.collect()
# One PM2.5 time-series plot per London station.
for key in London_comb.groupby(['station_id']).groups:
    plt.clf()
    plt.figure(figsize=(20,10))
    plt.title(key,fontsize=18)
    temp_data=London_comb.iloc[London_comb.groupby(['station_id']).groups[key]]
    plt.plot(temp_data['pm2.5 (ug/m3)'])
    #plt.xtickets(temp_data['utc_time'], fontsize=15)
    plt.show()
#missing values ratio
# Flag London stations whose PM2.5 series is more than 15% missing.
# PERF FIX: hoist the groupby — the original recomputed
# London_comb.groupby(['station_id']) twice per loop iteration.
London_missing_pm25=[]
london_station_groups = London_comb.groupby(['station_id']).groups
for key in london_station_groups:
    temp_data = London_comb.iloc[london_station_groups[key]]
    ratio = float(temp_data['pm2.5 (ug/m3)'].isnull().sum()/len(temp_data))
    if ratio > 0.15:
        print(key)
        London_missing_pm25.append(key)
# London station map highlighting high-missing-value stations in red.
map_london_1= folium.Map(location=temp_pd.iloc[0].values.tolist(), zoom_start=9,tiles='Stamen Terrain')
for key in location_Lodon:
    if key in London_missing_pm25:
        folium.Marker(location=location_Lodon[key],
                      popup='Missing value ratio > 0.15 Location',
                      icon=folium.Icon(color='red',icon='info-sign')).add_to(map_london_1)
    else:
        folium.Marker(location=location_Lodon[key]).add_to(map_london_1)
map_london_1
# Grid weather-station list and historical grid weather for London.
London_grid_sta=pd.read_csv(path+"London_grid_weather_station.csv")
London_grid_sta
London_grid_sta_weather=pd.read_csv(path+"London_historical_meo_grid.csv")
London_grid_sta_weather
import re
def split_time(x):
    """Normalize 'YYYY-MM-DD HH:MM:SS' into 'YYYY/M/D H:00'.

    Month, day and hour have a leading zero stripped so the result matches
    the London AQI table's measurementdategmt format (e.g. '2018/4/1 0:00').

    Fixes: the regex is now a raw string (the original '\\-| |:' raised an
    invalid-escape-sequence warning), and the three copy-pasted
    leading-zero branches are replaced by int() round-tripping, which is
    equivalent for two-digit timestamp fields ('09' -> '9', '00' -> '0').
    """
    parts = re.split(r'-| |:', x)
    year = parts[0]
    month, day, hour = (str(int(p)) for p in parts[1:4])
    return year + "/" + month + "/" + day + " " + hour + ":" + "00"
# Rewrite the weather table's utc_time column into the AQI table's date format.
London_grid_sta_weather["utc_time"].apply(lambda x: split_time(str(x)))
London_grid_sta_weather["utc_time"]=London_grid_sta_weather["utc_time"].apply(lambda x: split_time(str(x)))
2018/4/1 0:00 is the format of the measurementdategmt column.
from sklearn.cluster import MiniBatchKMeans
# Cluster London grid points (one cluster per AQ station) and overlay the
# cluster centres on the station map in green.
kmeans = MiniBatchKMeans(n_clusters=len(location_Lodon), batch_size=1000).fit(London_grid_sta_weather[['latitude','longitude']])
London_grid_sta_weather.loc[:, 'label'] = kmeans.labels_
for label in kmeans.cluster_centers_:
    folium.Marker(location=[label][0],popup='grid point',
                  icon=folium.Icon(color='green',icon='info-sign')).add_to(map_london_1)
map_london_1
# Compare time-slice coverage between the London AQI and weather tables.
a = set(London_comb.groupby(["measurementdategmt"]).groups.keys())
b = set(London_grid_sta_weather.groupby(["utc_time"]).groups.keys())
import matplotlib_venn as venn
common_time_list=list(set(a).intersection(b))
plt.title("Common time slice")
venn.venn2([a,b],
           set_labels=("time slice in AQI station"," time slice in weather grid station"))
plt.show()
# Collect row indices belonging to time slices present in only one of the two
# London tables, so those rows can be dropped and the tables aligned.
exclusive=(a ^ b)
London_comb_copy=London_comb.copy() #a
London_grid_sta_weather_copy=London_grid_sta_weather.copy() #b
# BUG FIX: the original tested `first_time_1 is 1` — identity comparison with
# an int literal (a SyntaxWarning and implementation-dependent). The flags were
# also redundant: the containers start empty, so extending unconditionally is
# equivalent. The groupby index maps are hoisted out of the loop as well.
comb_time_groups = London_comb.groupby(["measurementdategmt"]).groups
weather_time_groups = London_grid_sta_weather.groupby(["utc_time"]).groups
contaniner_1=[]
contaniner_2=[]
for timeslice in exclusive:
    if timeslice in a:
        contaniner_1 += list(comb_time_groups[timeslice])
    if timeslice in b:
        contaniner_2 += list(weather_time_groups[timeslice])
contaniner_1.sort()
contaniner_2.sort()
# Sanity checks on the aligned tables (display cells).
London_comb.drop(London_comb.index[contaniner_1]).sort_values(by=["measurementdategmt"])
len(London_comb.drop(London_comb.index[contaniner_1]).groupby(["measurementdategmt"]).groups)
len(London_grid_sta_weather.drop(London_grid_sta_weather.index[contaniner_2]).groupby(["utc_time"]).groups)
len(exclusive)
len(London_comb.groupby(["station_id"]).groups)
len(London_comb.drop(London_comb.index[contaniner_1]).groupby(["station_id"]).groups)
len(London_comb.drop(London_comb.index[contaniner_1]))/24
len(London_comb.drop(London_comb.index[contaniner_1]))/len(common_time_list)
len(London_grid_sta_weather.drop(London_grid_sta_weather.index[contaniner_2]).groupby(["stationName"]).groups)
len(contaniner_2)%861
Seems to run from 2017/1/1 0:00 to 2018/3/9 9:00, but the AQI station data has some inconsistencies.
# Residual hours modulo one day; would be 0 if the hourly series were complete.
(len(common_time_list)-10)%24
Oops — the time slices are not continuous; the modulus should be 0.
from pandas.plotting import autocorrelation_plot
London_comb=London_comb.sort_values(by=["measurementdategmt"])
from matplotlib import pyplot
# London pollutant columns to analyse.
col=['pm2.5 (ug/m3)','pm10 (ug/m3)','no2 (ug/m3)']
def autocorr(x, t=1):
    """Return the 2x2 correlation matrix between x and x shifted by lag t.

    BUG FIX: the original computed np.corrcoef but never returned it.
    NOTE(review): this re-definition shadows the identical helper defined
    earlier in the file; consider keeping only one copy.
    """
    return np.corrcoef(np.array([x[0:len(x)-t], x[t:len(x)]]))
# Per-station autocorrelation plots for the London pollutant series.
index_set=London_comb.groupby(['station_id']).groups
rcParams.update({'font.size': 24})
for keys in index_set.keys():
    #plt.plot(autocorr(beji_aqi_summary_copy[item].fillna(0).values, t=1))
    print(keys)
    plt.clf()
    plt.figure(figsize=(40,25))
    for item in col:
        autocorrelation_plot(London_comb[item].fillna(0).iloc[index_set[keys]],label=item)
    plt.legend(loc='upper right')
    pyplot.show()
import seaborn as sns
# Per-station correlation heatmaps for the London pollutant columns.
for keys in index_set.keys():
    #plt.plot(autocorr(beji_aqi_summary_copy[item].fillna(0).values, t=1))
    print(keys)
    plt.clf()
    plt.figure(figsize=(7,7))
    corr = London_comb[col].fillna(0).iloc[index_set[keys]].corr()
    sns.heatmap(corr)
    pyplot.show()